Windows 10 Coin

train: (row: 1,347,190, columns: 1,085) test: (row: 374,136, columns: 1,084)

y value: if HasClicked == True, app 1.8%

How to run

  1. Put the train and test files in ..\input
  2. Put the script file in ..\script
  3. In Jupyter Notebook, run all and get submission file in the same script folder

In [66]:
# Timer and file info
import math
import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc # We're gonna be clearing memory a lot
import matplotlib.pyplot as plt
import seaborn as sns
import random
import lightgbm as lgb
#from ml_metrics import mapk
import hashlib
from datetime import datetime
import re
import csv
import pickle
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import ensemble
from sklearn import model_selection
from sklearn.metrics import matthews_corrcoef, f1_score, classification_report, confusion_matrix, precision_score, recall_score
%matplotlib inline

# Timer
class Timer:
    def __init__(self, text=None):
        self.text = text
        
    def __enter__(self):
        self.cpu = time.clock()
        self.time = time.time()
        if self.text:
            print("{}...".format(self.text))
            print(datetime.now())
        return self

    def __exit__(self, *args):
        self.cpu = time.clock() - self.cpu
        self.time = time.time() - self.time
        if self.text:
            print("%s: cpu %0.2f, time %0.2f\n" % (self.text, self.cpu, self.time))

# Split to train and holdout sets with counts
def sample_train_holdout(_df, sample_count, holdout_count):   
    random.seed(7)
    sample_RowNumber = random.sample(list(_df['RowNumber']), (sample_count + holdout_count))
    train_RowNumber = random.sample(sample_RowNumber, sample_count)
    holdout_RowNumber = list(set(sample_RowNumber) - set(train_RowNumber))
    holdout = _df[_df['RowNumber'].isin(holdout_RowNumber)].copy()
    _df = _df[_df['RowNumber'].isin(train_RowNumber)]
    return _df, holdout 

# Sampling for train and holdout with imbalanced binary label
def trainHoldoutSampling(_df, _id, _label, _seed=7, t_tr=0.5, t_ho=0.5, f_tr=0.05, f_ho=0.5):
    random.seed(_seed)
    
    positive_id = list(_df[_df[_label]==True][_id].values)
    negative_id = list(_df[_df[_label]==False][_id].values)
    
    train_positive_id = random.sample(positive_id, int(len(positive_id) * t_tr))
    holdout_positive_id = random.sample(list(set(positive_id)-set(train_positive_id)), int(len(positive_id) * t_ho)) 
    train_negative_id = random.sample(negative_id, int(len(negative_id) * f_tr))
    holdout_negative_id = random.sample(list(set(negative_id)-set(train_negative_id)), int(len(negative_id) * f_ho))
    
    train_id = list(set(train_positive_id)|set(train_negative_id))
    holdout_id = list(set(holdout_positive_id)|set(holdout_negative_id))
    
    print('train count: {}, train positive count: {}'.format(len(train_id),len(train_positive_id)))
    print('holdout count: {}, holdout positive count: {}'.format(len(holdout_id),len(holdout_positive_id)))
    
    return _df[_df[_id].isin(train_id)], _df[_df[_id].isin(holdout_id)]

def datetime_features2(_df, _col):
    _format='%m/%d/%Y %I:%M:%S %p'
    _df[_col] = _df[_col].apply(lambda x: datetime.strptime(x, _format))
    
    colYear = _col+'Year'
    colMonth = _col+'Month'
    colDay = _col+'Day'
    colHour = _col+'Hour'
    #colYearMonthDay = _col+'YearMonthDay'
    #colYearMonthDayHour = _col+'YearMonthDayHour' 
    
    _df[colYear] = _df[_col].apply(lambda x: x.year)
    _df[colMonth] = _df[_col].apply(lambda x: x.month)
    _df[colDay] = _df[_col].apply(lambda x: x.day)
    _df[colHour] = _df[_col].apply(lambda x: x.hour)
    
    #ymd = [colYear, colMonth, colDay]
    #ymdh = [colYear, colMonth, colDay, colHour]
    
    #_df[colYearMonthDay] = _df[ymd].apply(lambda x: '_'.join(str(x)), axis=1)
    #_df[colYearMonthDayHour] = _df[ymdh].apply(lambda x: '_'.join(str(x)), axis=1)

    return _df
    
# Change date column datetime type and add date time features
def datetime_features(_df, _col, isDelete = False):
    # 1. For years greater than 2017, create year folder with regex and change year to 2017 in datetime column
    # find and return 4 digit number (1st finding) in dataframe string columns
    year_col = _col + 'Year'
    _df[year_col] = _df[_col].apply(lambda x: int(re.findall(r"\D(\d{4})\D", " "+ str(x) +" ")[0]))
    years = sorted(list(_df[year_col].unique()))
    yearsGreaterThan2017 = sorted(i for i in years if i > 2017)

    # Two ways for strange year data (1) change it to 2017 temporarily (2) remove from data; we will go with (1)
    # because we cannot remove test rows anyway
    if isDelete:
        _df = _df[~_df[year_col].isin(yearsGreaterThan2017)]
    else:
        for i in yearsGreaterThan2017:
            print("replace ", i, " to 2017 for conversion")
            _df.loc[_df[year_col] == i, _col] = _df[_df[year_col] == i][_col].values[0].replace(str(i), "2017")
    
    # How to remove strange year rows
    # train = train[~train['year'].isin(yearsGreaterThan2017)]

    # 2. Convert string to datetime
    _df[_col] = pd.to_datetime(_df[_col])
    print(_col, "column conversion to datetime type is done")
    
    # 3. Add more date time features
    month_col = _col + 'Month'
    week_col = _col + 'Week'
    weekday_col = _col + 'Weekday'
    day_col = _col + 'Day'
    hour_col = _col + 'Hour'
    #year_month_day_col = _col + 'YearMonthDay'
    #year_month_day_hour_col = _col + 'YearMonthDayHour'
    
    _df[month_col] = pd.DatetimeIndex(_df[_col]).month
    _df[week_col] = pd.DatetimeIndex(_df[_col]).week
    _df[weekday_col] = pd.DatetimeIndex(_df[_col]).weekday
    _df[day_col] = pd.DatetimeIndex(_df[_col]).day
    _df[hour_col] = pd.DatetimeIndex(_df[_col]).hour
    #_df[year_month_day_col] = _df[[year_col, month_col, day_col]].apply(lambda x: ''.join(str(x)), axis=1)
    #_df[year_month_day_hour_col] = _df[[year_col, month_col, day_col, hour_col]].apply(lambda x: ''.join(str(x)), axis=1)
    print("year, month, week, weekday, day, hour features are added")
    
    return _df

# Delete rows with list condition for dataframe
def delRows(_df, _col, _list):
    _df = _df[~_df[_col].isin(_list)]
    return _df

import re

# Create new column using regex pattern for strings for dataframe
def addFeatureRegex(_df, _col, _newCol):
    _df[_newCol] = _df[_col].apply(lambda x: int(re.findall(r"\D(\d{4})\D", " "+ str(x) +" ")[0]))
    return _df

# Convert string to datetime type
def stringToDatetime(_df, _col):
    _df[_col] = _df[_col].astype('datetime64[ns]')
    return _df

# Add features from datetime
def addDatetimeFeatures(_df, _col):
    _df[_col + 'Year'] = pd.DatetimeIndex(_df[_col]).year
    _df[_col + 'Month'] = pd.DatetimeIndex(_df[_col]).month
    _df[_col + 'Week'] = pd.DatetimeIndex(_df[_col]).week
    _df[_col + 'Weekday'] = pd.DatetimeIndex(_df[_col]).weekday
    _df[_col + 'Day'] = pd.DatetimeIndex(_df[_col]).day
    _df[_col + 'Hour'] = pd.DatetimeIndex(_df[_col]).hour
    return _df

# Get categorical column names
def categoricalColumns(_df):
    cat_columns = _df.select_dtypes(['object']).columns
    print("Categorical column count:", len(cat_columns))
    print("First 5 values:", cat_columns[:5])
    return cat_columns

# Get column names starting with
def columnsStartingWith(_df, _str):
    sorted_list = sorted(i for i in list(_df) if i.startswith(_str))
    print("Column count:", len(sorted_list))
    print("First 5 values:", sorted_list[:5])    
    return sorted_list

# Get column names ending with
def columnsEndingWith(_df, _str):
    sorted_list = sorted(i for i in list(_df) if i.endswith(_str))
    print("Column count:", len(sorted_list))
    print("First 5 values:", sorted_list[:5])    
    return sorted_list

# Get constant columns
def constantColumns(_df):
    constant_list = []
    cols = list(_df) # same as _df.columns.values
    for col in cols:
        if len(_df[col].unique()) == 1:
            constant_list.append(col)
    print("Constant column count:", len(constant_list))
    print("First 5 values:", constant_list[:5])  
    return constant_list

# Add null columns
def makeNullColumns(_df, _cols):
    null_df = _df[_cols].isnull()
    null_df.columns = null_df.columns + 'Null'
    _df = pd.concat([_df, null_df], axis=1)
    return _df

# Union
def union(a, b):
    return list(set(a)|set(b))

def unique(a):
    return list(set(a))

# undersampling - sample rate 0.8 for 80% samling using isUndersampled column 
def underSampling(_df, _sample_rate):
    _df['isUnderSampled'] = 1
    _rand_num = 1/(1-_sample_rate)
    underSample = np.random.randint(_rand_num, size=len(_df[_df['HasClicked'] == 0]))
    _df.loc[_df['HasClicked'] == 0, 'isUnderSampled'] = underSample>0
    return _df

# Add column with value count
def valueCountColumn(_df, _col):
    _dict = dict([(i, a) for i, a in zip(_df[_col].value_counts().index, _df[_col].value_counts().values)])
    _df[_col+'ValueCount'] = _df[_col].apply(lambda x: _dict[x])
    return _df

# Add column with bool values to check if keyword is contained or not
def containColumn(_df, _col, _str):
    _df[_col+'Cotains'+_str] = _df[_col].str.contains(_str)
    return _df

# Feature engineering
def feature_engineering(_df):
    print("shape:", _df.shape)
    print("Add datetime features...")
    datetime_columns = ['BubbleShownTime', 'FirstUpdatedDate', 'OSOOBEDateTime']
    for col in datetime_columns:
        print(col)
        if _df[col].isnull().sum() > 0:
            _df[col] = _df[col].fillna('1/1/2017 11:11:11 AM')
        _df = datetime_features2(_df, col)

    print("shape:", _df.shape)

    gc.collect()
    
    # Null count
    print("Missing value count...")
    _df['CntNs'] = _df.isnull().sum(axis=1) 

    cols = ['AppCategoryNMinus1', 'AppCategoryNMinus2', 'AppCategoryNMinus3', 'AppCategoryNMinus4', 'AppCategoryNMinus5',
           'AppCategoryNMinus6', 'AppCategoryNMinus7', 'AppCategoryNMinus8']
    _df['AppCatCntNs'] = _df[cols].isnull().sum(axis=1)

    #_df[cols] = _df[cols].fillna("NA")
    #for col in cols:
    #    print(col)
    #    _df[col+'HighLevel'] = _df[col].apply(lambda x: str(x).split(':')[0])
   
    # Game segment parse with '.'
    # to-do: 2nd and 3rd parsed values to add as features later, some exception handling is needed
    print("Gamer segment parsing...")
    _df['GamerSegment1'] = _df['GamerSegment'].apply(lambda x: str(x).split('.')[0] if str(x).split('.') else 'Unknown')
    
    # Check creativeName contains keyword or not - Week 7 is removed because space is not addressed in lightgbm v2 model
    keywords = ['SL', 'TS', 'Week7', 'Meet', 'Skype', 'Battery', 'Switch', 'Performance', 'Security', 'Surge']
    for keyword in keywords:
        _df = containColumn(_df, 'creativeName', keyword)
    #_df['week7'] = _df['Week7'].values + _df['Week 7'].values
    #_df.drop(['Week7', 'Week 7'], axis = 1, inplace = True)
    
    # Convert categorical columns to numeric
    print("Convert categorical columns to numeric...")
    cat_columns = _df.select_dtypes(['object']).columns
    for cat_column in cat_columns:
        print(cat_column)
        if cat_column == 'creativeName':
            _df['creativeNameTest'] = _df['creativeName'].values
        #_df[cat_column] = _df[cat_column].apply(lambda x: abs(hash(x)) )
        _df[cat_column] = _df[cat_column].apply(lambda x: int(hashlib.sha1(str(x).encode('utf-8')).hexdigest(), 16) % (10 ** 16) )
    gc.collect()
    
    # Replace missing values with -1
    print("Replace missing values with -1")
    _df = _df.fillna(-1)
    
    # Value count
    print("Value count...")
    cols = ['UniqueUserDeviceKey', 'CampaignId']
    for col in cols:
        print(col)
        _df = valueCountColumn(_df, col)
        
    return _df

# Get best threshold value for F1 score
def f1_best_threshold(_actual, _pred):
    thresholds = np.linspace(0.01, 0.5, 1000)

    fc = np.array([f1_score(_actual, _pred>thr) for thr in thresholds])
    plt.plot(thresholds, fc)
    best_threshold = thresholds[fc.argmax()]
    print('f1 score:', fc.max())
    print('best threshold:', best_threshold)
    print('TF pred mean:', (_pred>best_threshold).mean())
    
    return best_threshold

In [67]:
with Timer("Read train data..."):
    train = pd.read_csv('CoinMlCompetitionSoftlandingTrainWithHeader.tsv', sep='\t') # (1347190, 1085)
    print(train.shape)
    
    test_header = train.columns[0:1084]


Read train data......
2017-04-07 05:56:46.235156
C:\Users\ryutek\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:2717: DtypeWarning: Columns (2,8,17,33) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
(1347190, 1085)
Read train data...: cpu 209.50, time 209.52


In [68]:
df_header = pd.DataFrame(test_header)
df_header.to_csv('test_header.csv', index = False)

In [69]:
with Timer("Decide feature columns..."):
    # Reduce size by removing most of days and time features
    features = train.columns
    
    print("features without time_ and days_ columns")
    time_columns = columnsStartingWith(train, 'Time_')
    days_columns = columnsStartingWith(train, 'Days_')
    features = list(set(features) - set(time_columns))
    features = list(set(features) - set(days_columns))
    
    # Add important time features from feature importance and some validation
    imp_time_features = ['Time_Accessibility', 'Time_Browser', 'Time_Communications', 'Time_Content', 'Time_DevTools', 
                     'Time_Games', 'Time_Malware', 'Time_Media', 'Time_PersonalProductivity', 'Time_Readers', 
                     'Time_Search', 'Time_Social', 'Time_StudentAndLearning', 'Time_ModernApps', 
                     'Time_Games_Core', 'Time_Games_Casual', 'Time_windows_immersivecontrolpanel',
                     'Time_msascui_exe', 'Time_chrome_exe', 'Time_microsoft_windows_cortana', 'Time_lockapphost_exe']
    features = list(set(features) | set(imp_time_features))
    
    train = train[features]
    print(train.shape)
    
    df_features = pd.DataFrame(features)
    df_features.to_csv('initial_features.csv', index = False)
    
    #with open('feature_list', 'wb') as fp:
    #    pickle.dump(features, fp)


Decide feature columns......
2017-04-07 06:00:15.816642
features without time_ and days_ columns
Column count: 517
First 5 values: ['Time_0c72c7cd_217379cb4ae6f', 'Time_0c72c7cd_beta', 'Time_123freesolitaire_exe', 'Time_1cv8_exe', 'Time_2345explorer_exe']
Column count: 517
First 5 values: ['Days_0c72c7cd_217379cb4ae6f', 'Days_0c72c7cd_beta', 'Days_123freesolitaire_exe', 'Days_1cv8_exe', 'Days_2345explorer_exe']
(1347190, 72)
Decide feature columns...: cpu 0.68, time 0.67


In [70]:
# Train feature engineering
with Timer("Train feature engineering..."):
    #train = feature_engineering(train, isDeleteOddDateRows=True)
    train = feature_engineering(train)
    
    train_y = train['HasClicked'].values
    print("train y mean:", train_y.mean())


Train feature engineering......
2017-04-07 06:00:16.504007
shape: (1347190, 72)
Add datetime features...
BubbleShownTime
FirstUpdatedDate
OSOOBEDateTime
shape: (1347190, 84)
Missing value count...
Gamer segment parsing...
Convert categorical columns to numeric...
GamerSegment
BubbleShownTime
AppCategoryNMinus1
IsVirtualDevice
DefaultInternetBrowser
AppCategoryNMinus8
DisplayLanguage
AppCategoryNMinus7
AppCategoryNMinus3
GamerPCClassification
AppCategoryNMinus2
AppCategoryNMinus4
IsAlwaysOnAlwaysConnectedCapable
OSOOBEDateTime
AppCategoryNMinus5
IsDomainJoined
CampaignId
creativeName
ISOCountryShortName
AppCategoryNMinus6
DeviceTimezone
GamerSegment1
creativeNameCotainsSL
creativeNameCotainsTS
creativeNameCotainsWeek7
creativeNameCotainsMeet
creativeNameCotainsSkype
creativeNameCotainsBattery
creativeNameCotainsSwitch
creativeNameCotainsPerformance
creativeNameCotainsSecurity
creativeNameCotainsSurge
Replace missing values with -1
Value count...
UniqueUserDeviceKey
CampaignId
train y mean: 0.0183062522733
Train feature engineering...: cpu 239.30, time 239.31


In [71]:
with Timer("Read test and feature engineering..."):
    # Read tsv file
    test = pd.read_csv('CoinMlCompetitionSoftlandingEvaluateNoLabel.tsv', sep='\t', header = None)
    
    # Add header because test does not header
    test.columns = test_header
    
    # Reduce test size by leaving train features only
    test = test[list(set(features) - set(['HasClicked']))]
    
    # Feature engineering - should not delete odd date rows
    #test = feature_engineering(test, isDeleteOddDateRows=False)
    test = feature_engineering(test)
    
    print(test.shape)


Read test and feature engineering......
2017-04-07 06:04:15.831781
C:\Users\ryutek\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:2717: DtypeWarning: Columns (2,17,33) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
shape: (374137, 71)
Add datetime features...
BubbleShownTime
FirstUpdatedDate
OSOOBEDateTime
shape: (374137, 83)
Missing value count...
Gamer segment parsing...
Convert categorical columns to numeric...
GamerSegment
BubbleShownTime
AppCategoryNMinus1
DefaultInternetBrowser
DisplayLanguage
AppCategoryNMinus3
GamerPCClassification
AppCategoryNMinus2
IsAlwaysOnAlwaysConnectedCapable
CampaignId
creativeName
AppCategoryNMinus6
AppCategoryNMinus8
AppCategoryNMinus7
AppCategoryNMinus4
AppCategoryNMinus5
IsDomainJoined
ISOCountryShortName
DeviceTimezone
GamerSegment1
creativeNameCotainsSL
creativeNameCotainsTS
creativeNameCotainsWeek7
creativeNameCotainsMeet
creativeNameCotainsSkype
creativeNameCotainsBattery
creativeNameCotainsSwitch
creativeNameCotainsPerformance
creativeNameCotainsSecurity
creativeNameCotainsSurge
Replace missing values with -1
Value count...
UniqueUserDeviceKey
CampaignId
(374137, 99)
Read test and feature engineering...: cpu 124.49, time 124.50


In [72]:
# Get column groups and features
all_columns = train.columns
print("All columns:", len(all_columns))

# Remove constant columns for train (all included in time_ and days_ columns)
print("features without constant columns")
constant_columns = constantColumns(train)
features = list(set(all_columns) - set(constant_columns))
print("features:", len(features))

# With a lot of nulls, exclude time and days columns first and add later for improvement
#print("features without time_ and days_ columns")
#time_columns = columnsStartingWith(train, 'Time_')
#days_columns = columnsStartingWith(train, 'Days_')
#features = list(set(features) - set(time_columns))
#features = list(set(features) - set(days_columns))

# Drop features
drop_features = ['HasClicked', 'RowNumber', 'BubbleShownTime', 'FirstUpdatedDate', 'OSOOBEDateTime', 'creativeNameTest'] 
features = list(set(features) - set(drop_features))

# Drop features
#HighLevel_features = columnsEndingWith(train, 'HighLevel') 
#features = list(set(features) - set(HighLevel_features))

# Drop features
#UniqueUserDeviceKey_features = ['UniqueUserDeviceKeyValueCount', 'CampaignIdValueCount', 'creativeNameValueCount'] 
#features = list(set(features) - set(UniqueUserDeviceKey_features))

# Add high importance Time features - To Do: will change to Avg 
'''
imp_time_features = ['Time_Accessibility', 'Time_Browser', 'Time_Communications', 'Time_Content', 'Time_DevTools', 
                     'Time_Games', 'Time_Malware', 'Time_Media', 'Time_PersonalProductivity', 'Time_Readers', 
                     'Time_Search', 'Time_Social', 'Time_StudentAndLearning', 'Time_ModernApps', 
                     'Time_Games_Core', 'Time_Games_Casual', 'Time_windows_immersivecontrolpanel',
                     'Time_msascui_exe', 'Time_chrome_exe', 'Time_microsoft_windows_cortana', 'Time_lockapphost_exe']
features = list(set(features) | set(imp_time_features))
'''

print("Final features:", len(features))

with open('feature_list_final', 'wb') as fp:
    pickle.dump(features, fp)


All columns: 100
features without constant columns
Constant column count: 0
First 5 values: []
features: 100
Final features: 94

In [73]:
df_final_features = pd.DataFrame(features)
df_final_features.to_csv('final_features.csv', index = False)

In [74]:
features


Out[74]:
['GamerSegment',
 'FirstUpdatedDateDay',
 'Time_StudentAndLearning',
 'AppCategoryNMinus1',
 'DefaultInternetBrowser',
 'LifeTimeConversionCount',
 'Time_Games_Casual',
 'HasRearFacingCamera',
 'InternalPrimaryDisplayLogicalDPIX',
 'Time_Games_Core',
 'OSOOBEDateTimeHour',
 'Time_Browser',
 'DisplayLanguage',
 'TotalEngagementTimeInSec',
 'creativeNameCotainsSL',
 'CntNs',
 'AppCategoryNMinus3',
 'Time_Social',
 'NumberofInternalDisplays',
 'TotalPhysicalRAM',
 'GamerPCClassification',
 'AppCategoryNMinus2',
 'Time_Accessibility',
 'IsAlwaysOnAlwaysConnectedCapable',
 'PrimaryDiskTotalCapacity',
 'BubbleShownTimeDay',
 'ProcessorClockSpeed',
 'ProcessorPhysicalCores',
 'OSOOBEDateTimeYear',
 'RecentMonthConversionCount',
 'IsEducation',
 'Time_Games',
 'creativeNameCotainsBattery',
 'NumberofExternalDisplays',
 'creativeNameCotainsMeet',
 'Time_Content',
 'creativeNameCotainsTS',
 'IsTouchEnabled',
 'CampaignId',
 'creativeName',
 'Time_Readers',
 'creativeNameCotainsSurge',
 'Time_windows_immersivecontrolpanel',
 'Time_PersonalProductivity',
 'Time_Search',
 'InternalPrimaryDisplayResolutionVertical',
 'Time_microsoft_windows_cortana',
 'Time_DevTools',
 'InternalPrimaryDisplayResolutionHorizontal',
 'AppCategoryNMinus6',
 'creativeNameCotainsSecurity',
 'FirstUpdatedDateYear',
 'UniqueUserDeviceKey',
 'creativeNameCotainsPerformance',
 'FirstUpdatedDateMonth',
 'IsCommercial',
 'AppCatCntNs',
 'GamerSegment1',
 'Time_ModernApps',
 'IsVirtualDevice',
 'creativeNameCotainsSwitch',
 'FrontFacingCameraResolution',
 'Time_Malware',
 'AppCategoryNMinus8',
 'UniqueUserDeviceKeyValueCount',
 'HasFrontFacingCamera',
 'AppCategoryNMinus7',
 'BubbleShownTimeYear',
 'CampaignIdValueCount',
 'creativeNameCotainsWeek7',
 'OSOOBEDateTimeDay',
 'AppCategoryNMinus4',
 'LifeTimeImpressionCount',
 'ProcessorCores',
 'FirstUpdatedDateHour',
 'TotalDays',
 'InternalPrimaryDiagonalDisplaySizeInInches',
 'OSOOBEDateTimeMonth',
 'creativeNameCotainsSkype',
 'Time_Communications',
 'AppCategoryNMinus5',
 'IsPenCapable',
 'IsDomainJoined',
 'InternalBatteryNumberOfCharges',
 'Time_lockapphost_exe',
 'BubbleShownTimeMonth',
 'InternalPrimaryDisplayLogicalDPIY',
 'BubbleShownTimeHour',
 'ISOCountryShortName',
 'Time_msascui_exe',
 'IsMalware',
 'Time_chrome_exe',
 'Time_Media',
 'DeviceTimezone']

In [75]:
from sklearn.model_selection import train_test_split
with Timer('# train validation split'):
    #X_train, X_val, y_train, y_val = train_test_split(train[train.isUnderSampled == True][features], train_y[train.isUnderSampled == True], test_size=0.2, random_state=0)
    X_train, X_val, y_train, y_val = train_test_split(train[features], train_y, test_size=0.15, random_state=0)
    
    gc.collect()

    print(y_train.shape)
    print(X_train.shape)
    print(y_val.shape)
    print(X_val.shape)
    
    print(y_train.mean())
    print(y_val.mean())
    
    #del train
    gc.collect()


# train validation split...
2017-04-07 06:06:23.126392
(1145111,)
(1145111, 94)
(202079,)
(202079, 94)
0.0182043487487
0.018883703898
# train validation split: cpu 4.55, time 4.55


In [76]:
#train_data = lgb.Dataset(X_train[X_train.isUnderSampled == True][features], label=X_train[X_train.isUnderSampled == True]['HasClicked'].values)

train_data = lgb.Dataset(X_train[features], label=y_train)
val_data = lgb.Dataset(X_val[features], y_val)

# use train holdout directly with t f ratio
#train_data = lgb.Dataset(train[features], label=train_y)
#val_data = lgb.Dataset(holdout[features], y_holdout)

print(X_train[features].shape)
print(X_val[features].shape)


(1145111, 94)
(202079, 94)

In [77]:
random.seed(2017)

params = {
    'task' : 'train',
    'boosting_type' : 'gbdt', #'gbdt', # dart
    'objective' : 'binary',
    'metric' : 'auc', # 'binary_logloss', #'binary_logloss', # binary_logloss, auc
    'is_training_metric': True,
    'max_bin': 255,
    'num_leaves' : 64,
    'learning_rate' : 0.05, # 0.05, #0.1,
    'feature_fraction' : 0.8,
    'min_data_in_leaf': 10,
    'min_sum_hessian_in_leaf': 5,
    # 'num_threads': 16,
}
num_round = 300


bst = lgb.train(params, train_data, num_round, valid_sets=val_data, early_stopping_rounds=30)

# dump model with txt
bst.save_model('model.txt', num_iteration=bst.best_iteration)

# dump model with pickle
#with open('model.pkl', 'wb') as fout:
#    pickle.dump(bst, fout)

val_preds = bst.predict(X_val[features], num_iteration=bst.best_iteration)
#holdout_preds = bst.predict(holdout[features], num_iteration=bst.best_iteration)
#test_preds = bst.predict(test[features], num_iteration=bst.best_iteration)

#0.7019 

# Including all high level and ymd and ymdh
# [297]	valid_0's auc:0.67564 F1 score: 0.096338028169, best thr: 0.325385385385, Click mean: 0.0343981839588

# without ymd; f1 score not improved, so keep this
# [201]	valid_0's auc:0.67772 F1 score: 0.0966780126125, best thr: 0.306746746747, Click mean: 0.0379598932823

# With uniqueUserDeviceKey valueCount
# [368]	valid_0's auc:0.664831 F1 score: 0.06x ???

# Value counts
# [525]	valid_0's auc:0.686445 f1 score: 0.104380886546 thr: 0.325875875876 Click mean: 0.0332386612486 (gain: 0.04)

# Count UniqueUserDeviceKey
# [505]	valid_0's auc:0.706443 f1 score: 0.128913201081 thr: 0.371491491491 Click mean: 0.0267462248702 (gain:0.024)

# Count CampaignId
# [544]	valid_0's auc:0.707357 f1 score: 0.13101569594 thr: 0.363643643644 Click mean: 0.0274719972684 (gain: 0.002)

# Remove all time and days
# [392]	valid_0's auc:0.703582 f1 score: 0.123669773283 thr: 0.378358358358 Click mean: 0.0266139148895

# Include imp time features
# [418]	valid_0's auc:0.706095 f1 score: 0.126989843694 thr: 0.386206206206 Click mean: 0.0229143624878 (loss: 0.004)

# High score submission
# [1409]	valid_0's auc:0.701104 local f1: f1 score: 0.143733567046 lb: 0.124101 (early stopping: 5)


[1]	valid_0's auc: 0.637387
Train until valid scores didn't improve in 30 rounds.
[2]	valid_0's auc: 0.65742
[3]	valid_0's auc: 0.661398
[4]	valid_0's auc: 0.659531
[5]	valid_0's auc: 0.663241
[6]	valid_0's auc: 0.668461
[7]	valid_0's auc: 0.66899
[8]	valid_0's auc: 0.671335
[9]	valid_0's auc: 0.670687
[10]	valid_0's auc: 0.671952
[11]	valid_0's auc: 0.670663
[12]	valid_0's auc: 0.673588
[13]	valid_0's auc: 0.673001
[14]	valid_0's auc: 0.672487
[15]	valid_0's auc: 0.671881
[16]	valid_0's auc: 0.672103
[17]	valid_0's auc: 0.671818
[18]	valid_0's auc: 0.673034
[19]	valid_0's auc: 0.67256
[20]	valid_0's auc: 0.673628
[21]	valid_0's auc: 0.672909
[22]	valid_0's auc: 0.672653
[23]	valid_0's auc: 0.672332
[24]	valid_0's auc: 0.672364
[25]	valid_0's auc: 0.671823
[26]	valid_0's auc: 0.671646
[27]	valid_0's auc: 0.671195
[28]	valid_0's auc: 0.670834
[29]	valid_0's auc: 0.67089
[30]	valid_0's auc: 0.671948
[31]	valid_0's auc: 0.671837
[32]	valid_0's auc: 0.671501
[33]	valid_0's auc: 0.671758
[34]	valid_0's auc: 0.671642
[35]	valid_0's auc: 0.671505
[36]	valid_0's auc: 0.671402
[37]	valid_0's auc: 0.671176
[38]	valid_0's auc: 0.671057
[39]	valid_0's auc: 0.671389
[40]	valid_0's auc: 0.671773
[41]	valid_0's auc: 0.672841
[42]	valid_0's auc: 0.674168
[43]	valid_0's auc: 0.67451
[44]	valid_0's auc: 0.67467
[45]	valid_0's auc: 0.675596
[46]	valid_0's auc: 0.675338
[47]	valid_0's auc: 0.675768
[48]	valid_0's auc: 0.676012
[49]	valid_0's auc: 0.676767
[50]	valid_0's auc: 0.677457
[51]	valid_0's auc: 0.677297
[52]	valid_0's auc: 0.67749
[53]	valid_0's auc: 0.678036
[54]	valid_0's auc: 0.677877
[55]	valid_0's auc: 0.677651
[56]	valid_0's auc: 0.677388
[57]	valid_0's auc: 0.677379
[58]	valid_0's auc: 0.677688
[59]	valid_0's auc: 0.677667
[60]	valid_0's auc: 0.677758
[61]	valid_0's auc: 0.677468
[62]	valid_0's auc: 0.677523
[63]	valid_0's auc: 0.678253
[64]	valid_0's auc: 0.678082
[65]	valid_0's auc: 0.678601
[66]	valid_0's auc: 0.678428
[67]	valid_0's auc: 0.67829
[68]	valid_0's auc: 0.67859
[69]	valid_0's auc: 0.679469
[70]	valid_0's auc: 0.679822
[71]	valid_0's auc: 0.680661
[72]	valid_0's auc: 0.681076
[73]	valid_0's auc: 0.681323
[74]	valid_0's auc: 0.681756
[75]	valid_0's auc: 0.681971
[76]	valid_0's auc: 0.682009
[77]	valid_0's auc: 0.682624
[78]	valid_0's auc: 0.683151
[79]	valid_0's auc: 0.683508
[80]	valid_0's auc: 0.68353
[81]	valid_0's auc: 0.683899
[82]	valid_0's auc: 0.684285
[83]	valid_0's auc: 0.684343
[84]	valid_0's auc: 0.685003
[85]	valid_0's auc: 0.685456
[86]	valid_0's auc: 0.685453
[87]	valid_0's auc: 0.686096
[88]	valid_0's auc: 0.686225
[89]	valid_0's auc: 0.686535
[90]	valid_0's auc: 0.686982
[91]	valid_0's auc: 0.687392
[92]	valid_0's auc: 0.687542
[93]	valid_0's auc: 0.687723
[94]	valid_0's auc: 0.688048
[95]	valid_0's auc: 0.688485
[96]	valid_0's auc: 0.688862
[97]	valid_0's auc: 0.68929
[98]	valid_0's auc: 0.689301
[99]	valid_0's auc: 0.689931
[100]	valid_0's auc: 0.690335
[101]	valid_0's auc: 0.690742
[102]	valid_0's auc: 0.690915
[103]	valid_0's auc: 0.691016
[104]	valid_0's auc: 0.691078
[105]	valid_0's auc: 0.691347
[106]	valid_0's auc: 0.691546
[107]	valid_0's auc: 0.691778
[108]	valid_0's auc: 0.69211
[109]	valid_0's auc: 0.692125
[110]	valid_0's auc: 0.692495
[111]	valid_0's auc: 0.692678
[112]	valid_0's auc: 0.692835
[113]	valid_0's auc: 0.693116
[114]	valid_0's auc: 0.69323
[115]	valid_0's auc: 0.693398
[116]	valid_0's auc: 0.693431
[117]	valid_0's auc: 0.69355
[118]	valid_0's auc: 0.693817
[119]	valid_0's auc: 0.693986
[120]	valid_0's auc: 0.694216
[121]	valid_0's auc: 0.694274
[122]	valid_0's auc: 0.694794
[123]	valid_0's auc: 0.694841
[124]	valid_0's auc: 0.695064
[125]	valid_0's auc: 0.695147
[126]	valid_0's auc: 0.695268
[127]	valid_0's auc: 0.695523
[128]	valid_0's auc: 0.695811
[129]	valid_0's auc: 0.69585
[130]	valid_0's auc: 0.696001
[131]	valid_0's auc: 0.696392
[132]	valid_0's auc: 0.696548
[133]	valid_0's auc: 0.696542
[134]	valid_0's auc: 0.696635
[135]	valid_0's auc: 0.69672
[136]	valid_0's auc: 0.696802
[137]	valid_0's auc: 0.696772
[138]	valid_0's auc: 0.69696
[139]	valid_0's auc: 0.697152
[140]	valid_0's auc: 0.697208
[141]	valid_0's auc: 0.697379
[142]	valid_0's auc: 0.697608
[143]	valid_0's auc: 0.697801
[144]	valid_0's auc: 0.698105
[145]	valid_0's auc: 0.698304
[146]	valid_0's auc: 0.698394
[147]	valid_0's auc: 0.698454
[148]	valid_0's auc: 0.698506
[149]	valid_0's auc: 0.69856
[150]	valid_0's auc: 0.69852
[151]	valid_0's auc: 0.698816
[152]	valid_0's auc: 0.698854
[153]	valid_0's auc: 0.699118
[154]	valid_0's auc: 0.699288
[155]	valid_0's auc: 0.699385
[156]	valid_0's auc: 0.699498
[157]	valid_0's auc: 0.699632
[158]	valid_0's auc: 0.699812
[159]	valid_0's auc: 0.699865
[160]	valid_0's auc: 0.699903
[161]	valid_0's auc: 0.699944
[162]	valid_0's auc: 0.700101
[163]	valid_0's auc: 0.700185
[164]	valid_0's auc: 0.700216
[165]	valid_0's auc: 0.700388
[166]	valid_0's auc: 0.700478
[167]	valid_0's auc: 0.700439
[168]	valid_0's auc: 0.7005
[169]	valid_0's auc: 0.70054
[170]	valid_0's auc: 0.700766
[171]	valid_0's auc: 0.700823
[172]	valid_0's auc: 0.700859
[173]	valid_0's auc: 0.700874
[174]	valid_0's auc: 0.700905
[175]	valid_0's auc: 0.700936
[176]	valid_0's auc: 0.701101
[177]	valid_0's auc: 0.701164
[178]	valid_0's auc: 0.701279
[179]	valid_0's auc: 0.701408
[180]	valid_0's auc: 0.701507
[181]	valid_0's auc: 0.701481
[182]	valid_0's auc: 0.701531
[183]	valid_0's auc: 0.70162
[184]	valid_0's auc: 0.701653
[185]	valid_0's auc: 0.701636
[186]	valid_0's auc: 0.701892
[187]	valid_0's auc: 0.701906
[188]	valid_0's auc: 0.702068
[189]	valid_0's auc: 0.702071
[190]	valid_0's auc: 0.702053
[191]	valid_0's auc: 0.70212
[192]	valid_0's auc: 0.702178
[193]	valid_0's auc: 0.702275
[194]	valid_0's auc: 0.702264
[195]	valid_0's auc: 0.702322
[196]	valid_0's auc: 0.702394
[197]	valid_0's auc: 0.70235
[198]	valid_0's auc: 0.702341
[199]	valid_0's auc: 0.702434
[200]	valid_0's auc: 0.702651
[201]	valid_0's auc: 0.702539
[202]	valid_0's auc: 0.702624
[203]	valid_0's auc: 0.702682
[204]	valid_0's auc: 0.702785
[205]	valid_0's auc: 0.702804
[206]	valid_0's auc: 0.702768
[207]	valid_0's auc: 0.702906
[208]	valid_0's auc: 0.702966
[209]	valid_0's auc: 0.703012
[210]	valid_0's auc: 0.703163
[211]	valid_0's auc: 0.703283
[212]	valid_0's auc: 0.703338
[213]	valid_0's auc: 0.703345
[214]	valid_0's auc: 0.703281
[215]	valid_0's auc: 0.703293
[216]	valid_0's auc: 0.703284
[217]	valid_0's auc: 0.703334
[218]	valid_0's auc: 0.703305
[219]	valid_0's auc: 0.703471
[220]	valid_0's auc: 0.703511
[221]	valid_0's auc: 0.703398
[222]	valid_0's auc: 0.703417
[223]	valid_0's auc: 0.703488
[224]	valid_0's auc: 0.703535
[225]	valid_0's auc: 0.703435
[226]	valid_0's auc: 0.703484
[227]	valid_0's auc: 0.703532
[228]	valid_0's auc: 0.703691
[229]	valid_0's auc: 0.703503
[230]	valid_0's auc: 0.703514
[231]	valid_0's auc: 0.703534
[232]	valid_0's auc: 0.703613
[233]	valid_0's auc: 0.703725
[234]	valid_0's auc: 0.70375
[235]	valid_0's auc: 0.703742
[236]	valid_0's auc: 0.703838
[237]	valid_0's auc: 0.703825
[238]	valid_0's auc: 0.703942
[239]	valid_0's auc: 0.703985
[240]	valid_0's auc: 0.704104
[241]	valid_0's auc: 0.704053
[242]	valid_0's auc: 0.704045
[243]	valid_0's auc: 0.704022
[244]	valid_0's auc: 0.703978
[245]	valid_0's auc: 0.703943
[246]	valid_0's auc: 0.703975
[247]	valid_0's auc: 0.703948
[248]	valid_0's auc: 0.703968
[249]	valid_0's auc: 0.704099
[250]	valid_0's auc: 0.704173
[251]	valid_0's auc: 0.704104
[252]	valid_0's auc: 0.704089
[253]	valid_0's auc: 0.704102
[254]	valid_0's auc: 0.704113
[255]	valid_0's auc: 0.704178
[256]	valid_0's auc: 0.704219
[257]	valid_0's auc: 0.704287
[258]	valid_0's auc: 0.704375
[259]	valid_0's auc: 0.704394
[260]	valid_0's auc: 0.704417
[261]	valid_0's auc: 0.704272
[262]	valid_0's auc: 0.704195
[263]	valid_0's auc: 0.704269
[264]	valid_0's auc: 0.704337
[265]	valid_0's auc: 0.704284
[266]	valid_0's auc: 0.704291
[267]	valid_0's auc: 0.704277
[268]	valid_0's auc: 0.704236
[269]	valid_0's auc: 0.70419
[270]	valid_0's auc: 0.704249
[271]	valid_0's auc: 0.704192
[272]	valid_0's auc: 0.704179
[273]	valid_0's auc: 0.704302
[274]	valid_0's auc: 0.704363
[275]	valid_0's auc: 0.704423
[276]	valid_0's auc: 0.70435
[277]	valid_0's auc: 0.704375
[278]	valid_0's auc: 0.704423
[279]	valid_0's auc: 0.704489
[280]	valid_0's auc: 0.704525
[281]	valid_0's auc: 0.704603
[282]	valid_0's auc: 0.704591
[283]	valid_0's auc: 0.704624
[284]	valid_0's auc: 0.704708
[285]	valid_0's auc: 0.704799
[286]	valid_0's auc: 0.704979
[287]	valid_0's auc: 0.70488
[288]	valid_0's auc: 0.704762
[289]	valid_0's auc: 0.704816
[290]	valid_0's auc: 0.704832
[291]	valid_0's auc: 0.704894
[292]	valid_0's auc: 0.704895
[293]	valid_0's auc: 0.704892
[294]	valid_0's auc: 0.704973
[295]	valid_0's auc: 0.704965
[296]	valid_0's auc: 0.70498
[297]	valid_0's auc: 0.705054
[298]	valid_0's auc: 0.705132
[299]	valid_0's auc: 0.705231
[300]	valid_0's auc: 0.705259

In [78]:
print('Validaion')
val_best_threshold = f1_best_threshold(y_val, val_preds)

#f1 score: 0.143733567046
#best threshold: 0.0673873873874
#TF pred mean: 0.0149941359567


Validaion
f1 score: 0.14913280162
best threshold: 0.0634634634635
TF pred mean: 0.0202049693437

In [79]:
bst.feature_importance()


Out[79]:
array([251, 448,  13, 243, 171, 728, 107,   5,  16, 125, 368, 234, 327,
       264,  88, 226, 193,  83,  18, 223, 113, 207,  38,   2, 213, 321,
       487,  57,  90,   6,  24, 136,  92,  22,  30, 147, 184,  35, 614,
       398, 157,  52, 433, 265,   8, 154, 271,  74, 118, 191,  23, 138,
       667,   9, 272, 181,  69, 205, 327,  12,  56,  88,   0, 176, 534,
        15, 222,   0, 585,  17, 489, 190, 718,  74, 383, 181, 347, 371,
        17, 186, 171,  13,  40, 193, 265,  24,   1, 340, 883, 280,   0,
       289, 287, 492])

In [80]:
#feature_list = X_val[features].columns.values
feature_list = X_val[features].columns.values
df_fi = pd.DataFrame(bst.feature_importance(), columns=['importance'])
df_fi['feature'] = feature_list
df_fi = df_fi.sort_values('importance', ascending = 0)
df_fi[df_fi.importance >= 10]


Out[80]:
importance feature
88 883 ISOCountryShortName
5 728 LifeTimeConversionCount
72 718 LifeTimeImpressionCount
52 667 UniqueUserDeviceKey
38 614 CampaignId
68 585 CampaignIdValueCount
64 534 UniqueUserDeviceKeyValueCount
93 492 DeviceTimezone
70 489 OSOOBEDateTimeDay
26 487 ProcessorClockSpeed
1 448 FirstUpdatedDateDay
42 433 Time_windows_immersivecontrolpanel
39 398 creativeName
74 383 FirstUpdatedDateHour
77 371 OSOOBEDateTimeMonth
10 368 OSOOBEDateTimeHour
76 347 InternalPrimaryDiagonalDisplaySizeInInches
87 340 BubbleShownTimeHour
12 327 DisplayLanguage
58 327 Time_ModernApps
25 321 BubbleShownTimeDay
91 289 Time_chrome_exe
92 287 Time_Media
89 280 Time_msascui_exe
54 272 FirstUpdatedDateMonth
46 271 Time_microsoft_windows_cortana
84 265 Time_lockapphost_exe
43 265 Time_PersonalProductivity
13 264 TotalEngagementTimeInSec
0 251 GamerSegment
... ... ...
48 118 InternalPrimaryDisplayResolutionHorizontal
20 113 GamerPCClassification
6 107 Time_Games_Casual
32 92 creativeNameCotainsBattery
28 90 OSOOBEDateTimeYear
14 88 creativeNameCotainsSL
61 88 FrontFacingCameraResolution
17 83 Time_Social
47 74 Time_DevTools
73 74 ProcessorCores
56 69 AppCatCntNs
27 57 ProcessorPhysicalCores
60 56 creativeNameCotainsSwitch
41 52 creativeNameCotainsSurge
82 40 IsDomainJoined
22 38 Time_Accessibility
37 35 IsTouchEnabled
34 30 creativeNameCotainsMeet
85 24 BubbleShownTimeMonth
30 24 IsEducation
50 23 creativeNameCotainsSecurity
33 22 NumberofExternalDisplays
18 18 NumberofInternalDisplays
69 17 creativeNameCotainsWeek7
78 17 creativeNameCotainsSkype
8 16 InternalPrimaryDisplayLogicalDPIX
65 15 HasFrontFacingCamera
81 13 IsPenCapable
2 13 Time_StudentAndLearning
59 12 IsVirtualDevice

85 rows × 2 columns


In [25]:
zeroImportance = df_fi[df_fi.importance == 0]['feature'].values
print(len(zeroImportance))


1

In [29]:
bst = lgb.Booster(model_file="model.txt")

In [91]:
bst


Out[91]:
<lightgbm.basic.Booster at 0x2603eb4eb00>

In [81]:
with Timer('# predict test data'):
    preds = bst.predict(test[features], num_iteration=bst.best_iteration)


# predict test data...
2017-04-07 06:10:56.475872
# predict test data: cpu 5.86, time 5.86


In [82]:
#print(bestEpsilon)
print(val_best_threshold)


0.0634634634635

In [83]:
test_id = test.RowNumber.values
submission = pd.DataFrame({'RowNumber': test_id})
submission['HasClicked'] = preds > val_best_threshold
print("Click mean:", submission.HasClicked.mean())
print("Submission file...")
submission.to_csv("W10_Coin_LightGBM_FinalV2_lr0.05_0403.csv", index = False)
submission.head()


Click mean: 0.0259797881525
Submission file...
Out[83]:
RowNumber HasClicked
0 1 False
1 2 False
2 3 False
3 4 False
4 5 False

In [96]:
print(test_header)
print(test_header2)
print(list(set(test_header) - set(test_header2)))


Index(['RowNumber', 'BubbleShownTime', 'CampaignId', 'creativeName',
       'UniqueUserDeviceKey', 'LifeTimeImpressionCount',
       'LifeTimeConversionCount', 'RecentMonthConversionCount',
       'IsVirtualDevice', 'ISOCountryShortName',
       ...
       'Time_Social', 'Days_Social', 'Time_StudentAndLearning',
       'Days_StudentAndLearning', 'Time_ModernApps', 'Days_ModernApps',
       'Time_Games_Core', 'Days_Games_Core', 'Time_Games_Casual',
       'Days_Games_Casual'],
      dtype='object', length=1084)
Index(['RowNumber', 'BubbleShownTime', 'CampaignId', 'creativeName',
       'UniqueUserDeviceKey', 'LifeTimeImpressionCount',
       'LifeTimeConversionCount', 'RecentMonthConversionCount',
       'IsVirtualDevice', 'ISOCountryShortName',
       ...
       'Time_Social', 'Days_Social', 'Time_StudentAndLearning',
       'Days_StudentAndLearning', 'Time_ModernApps', 'Days_ModernApps',
       'Time_Games_Core', 'Days_Games_Core', 'Time_Games_Casual',
       'Days_Games_Casual'],
      dtype='object', length=1084)
[]

In [18]:
bst.best_iteration


Out[18]:
457

In [97]:
# Read tsv file
test2 = pd.read_csv('../input/CoinMlCompetitionSoftlandingEvaluateNoLabel.tsv', sep='\t', header = None)

# Add header because test does not have header
with open ('test_header', 'rb') as fp:
    test_header2 = pickle.load(fp)

test2.columns = test_header2

# Get feature list
with open ('feature_list', 'rb') as fp:
    features2 = pickle.load(fp)

# Reduce test size by leaving train features only
test2 = test2[list(set(features2) - set(['HasClicked']))]

# Feature engineering - should not delete odd date rows
#test = feature_engineering(test, isDeleteOddDateRows=False)
test2 = feature_engineering(test2)

print(test2.shape)


C:\Users\hyunor\AppData\Local\Continuum\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:2717: DtypeWarning: Columns (2,17,33) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
shape: (374137, 71)
Add datetime features...
BubbleShownTime
FirstUpdatedDate
OSOOBEDateTime
shape: (374137, 83)
Missing value count...
Gamer segment parsing...
Convert categorical columns to numeric...
AppCategoryNMinus7
DisplayLanguage
IsAlwaysOnAlwaysConnectedCapable
AppCategoryNMinus2
creativeName
AppCategoryNMinus5
AppCategoryNMinus1
AppCategoryNMinus6
AppCategoryNMinus4
BubbleShownTime
GamerSegment
ISOCountryShortName
AppCategoryNMinus8
IsDomainJoined
GamerPCClassification
DeviceTimezone
DefaultInternetBrowser
CampaignId
AppCategoryNMinus3
GamerSegment1
creativeNameCotainsSL
creativeNameCotainsTS
creativeNameCotainsWeek7
creativeNameCotainsWeek 7
creativeNameCotainsMeet
creativeNameCotainsSkype
creativeNameCotainsBattery
creativeNameCotainsSwitch
creativeNameCotainsPerformance
creativeNameCotainsSecurity
creativeNameCotainsSurge
Replace missing values with -1
Value count...
UniqueUserDeviceKey
CampaignId
(374137, 100)

In [118]:
df_header = pd.DataFrame(test_header)
df_header.to_csv('test_header.csv', index = False)

#df_header = pd.read_csv('test_header.csv')
#header = df_header['0'].values

In [106]:
df_features = pd.DataFrame(features)
df_features.to_csv('final_features.csv', index = False)

In [108]:
df_feats = pd.read_csv('final_features.csv')
features_final2 = df_feats['0'].values

In [111]:
features_final2 = df_feats['0'].values

In [98]:
# Read final features
with open ('final_feature_list', 'rb') as fp:
    features_final2 = pickle.load(fp)

In [102]:
print(features)
print(features_final2)
# Read tsv file
test2 = pd.read_csv('../input/CoinMlCompetitionSoftlandingEvaluateNoLabel.tsv', sep='\t', header = None)
print(list(set(features)-set(features_final2)))


['AppCategoryNMinus7', 'Time_Search', 'Time_Malware', 'IsPenCapable', 'InternalPrimaryDisplayResolutionVertical', 'Time_ModernApps', 'TotalDays', 'Time_msascui_exe', 'OSOOBEDateTimeHour', 'Time_Communications', 'CntNs', 'BubbleShownTimeMonth', 'ProcessorClockSpeed', 'creativeNameCotainsMeet', 'FirstUpdatedDateHour', 'InternalPrimaryDisplayLogicalDPIY', 'DisplayLanguage', 'AppCatCntNs', 'Time_Accessibility', 'IsAlwaysOnAlwaysConnectedCapable', 'Time_Readers', 'AppCategoryNMinus2', 'CampaignIdValueCount', 'creativeNameCotainsSurge', 'HasFrontFacingCamera', 'IsCommercial', 'Time_Games', 'creativeNameCotainsWeek7', 'creativeName', 'BubbleShownTimeDay', 'AppCategoryNMinus5', 'creativeNameCotainsSwitch', 'Time_Browser', 'Time_Social', 'TotalEngagementTimeInSec', 'AppCategoryNMinus1', 'AppCategoryNMinus6', 'BubbleShownTimeYear', 'BubbleShownTimeHour', 'AppCategoryNMinus4', 'FirstUpdatedDateYear', 'FirstUpdatedDateMonth', 'Time_Games_Core', 'OSOOBEDateTimeYear', 'Time_Media', 'creativeNameCotainsSkype', 'PrimaryDiskTotalCapacity', 'FrontFacingCameraResolution', 'OSOOBEDateTimeDay', 'GamerSegment', 'InternalPrimaryDisplayResolutionHorizontal', 'Time_Games_Casual', 'NumberofInternalDisplays', 'InternalBatteryNumberOfCharges', 'creativeNameCotainsBattery', 'LifeTimeImpressionCount', 'ProcessorCores', 'IsTouchEnabled', 'Time_PersonalProductivity', 'InternalPrimaryDisplayLogicalDPIX', 'ProcessorPhysicalCores', 'IsEducation', 'ISOCountryShortName', 'Time_lockapphost_exe', 'InternalPrimaryDiagonalDisplaySizeInInches', 'NumberofExternalDisplays', 'AppCategoryNMinus8', 'Time_StudentAndLearning', 'IsDomainJoined', 'GamerPCClassification', 'Time_windows_immersivecontrolpanel', 'Time_chrome_exe', 'OSOOBEDateTimeMonth', 'Time_DevTools', 'Time_microsoft_windows_cortana', 'GamerSegment1', 'Time_Content', 'UniqueUserDeviceKey', 'LifeTimeConversionCount', 'DeviceTimezone', 'IsMalware', 'creativeNameCotainsTS', 'HasRearFacingCamera', 'creativeNameCotainsWeek 7', 'RecentMonthConversionCount', 'creativeNameCotainsSecurity', 'UniqueUserDeviceKeyValueCount', 'DefaultInternetBrowser', 'creativeNameCotainsPerformance', 'creativeNameCotainsSL', 'TotalPhysicalRAM', 'FirstUpdatedDateDay', 'CampaignId', 'AppCategoryNMinus3', 'IsVirtualDevice']
['IsPenCapable', 'Time_Games_Core', 'creativeNameCotainsPerformance', 'AppCategoryNMinus8', 'Time_PersonalProductivity', 'BubbleShownTimeHour', 'Time_StudentAndLearning', 'BubbleShownTimeYear', 'Time_Media', 'AppCategoryNMinus7', 'IsVirtualDevice', 'CampaignIdValueCount', 'CntNs', 'AppCatCntNs', 'TotalDays', 'LifeTimeConversionCount', 'InternalPrimaryDiagonalDisplaySizeInInches', 'Time_Communications', 'Time_Search', 'creativeNameCotainsSwitch', 'NumberofExternalDisplays', 'Time_microsoft_windows_cortana', 'creativeNameCotainsWeek7', 'AppCategoryNMinus6', 'UniqueUserDeviceKeyValueCount', 'IsAlwaysOnAlwaysConnectedCapable', 'TotalPhysicalRAM', 'GamerSegment', 'UniqueUserDeviceKey', 'PrimaryDiskTotalCapacity', 'Time_chrome_exe', 'Time_Accessibility', 'creativeNameCotainsSecurity', 'Time_msascui_exe', 'Time_Social', 'AppCategoryNMinus5', 'Time_Content', 'FirstUpdatedDateDay', 'IsCommercial', 'AppCategoryNMinus3', 'BubbleShownTimeDay', 'OSOOBEDateTimeMonth', 'creativeNameCotainsSkype', 'FirstUpdatedDateMonth', 'ISOCountryShortName', 'creativeNameCotainsMeet', 'ProcessorPhysicalCores', 'AppCategoryNMinus2', 'OSOOBEDateTimeDay', 'AppCategoryNMinus4', 'IsDomainJoined', 'Time_windows_immersivecontrolpanel', 'ProcessorClockSpeed', 'creativeNameCotainsSurge', 'CampaignId', 'InternalPrimaryDisplayResolutionHorizontal', 'DisplayLanguage', 'Time_Malware', 'creativeNameCotainsSL', 'OSOOBEDateTimeHour', 'ProcessorCores', 'Time_Games', 'Time_lockapphost_exe', 'DeviceTimezone', 'IsMalware', 'GamerSegment1', 'BubbleShownTimeMonth', 'Time_Games_Casual', 'IsEducation', 'HasRearFacingCamera', 'creativeNameCotainsWeek 7', 'InternalBatteryNumberOfCharges', 'InternalPrimaryDisplayLogicalDPIX', 'GamerPCClassification', 'OSOOBEDateTimeYear', 'Time_Browser', 'creativeNameCotainsTS', 'Time_DevTools', 'creativeNameCotainsBattery', 'IsTouchEnabled', 'DefaultInternetBrowser', 'FirstUpdatedDateYear', 'InternalPrimaryDisplayResolutionVertical', 'NumberofInternalDisplays', 'InternalPrimaryDisplayLogicalDPIY', 'creativeName', 'FirstUpdatedDateHour', 'TotalEngagementTimeInSec', 'RecentMonthConversionCount', 'LifeTimeImpressionCount', 'Time_Readers', 'HasFrontFacingCamera', 'FrontFacingCameraResolution', 'Time_ModernApps', 'AppCategoryNMinus1']
[]

In [127]:
initial_features


Out[127]:
array(['AppCategoryNMinus7', 'IsPenCapable', 'GamerSegment',
       'InternalPrimaryDisplayResolutionHorizontal',
       'InternalPrimaryDisplayResolutionVertical', 'Time_Search',
       'Time_Malware', 'NumberofInternalDisplays',
       'InternalBatteryNumberOfCharges', 'Time_Games_Casual',
       'Time_ModernApps', 'TotalDays', 'FrontFacingCameraResolution',
       'LifeTimeImpressionCount', 'IsTouchEnabled', 'ProcessorCores',
       'OSOOBEDateTime', 'InternalPrimaryDisplayLogicalDPIX',
       'IsEducation', 'ProcessorPhysicalCores', 'Time_msascui_exe',
       'Time_PersonalProductivity', 'Time_Communications',
       'ISOCountryShortName', 'ProcessorClockSpeed',
       'NumberofExternalDisplays',
       'InternalPrimaryDiagonalDisplaySizeInInches', 'AppCategoryNMinus8',
       'InternalPrimaryDisplayLogicalDPIY', 'DisplayLanguage',
       'Time_lockapphost_exe', 'Time_Accessibility',
       'IsAlwaysOnAlwaysConnectedCapable', 'Time_StudentAndLearning',
       'IsDomainJoined', 'Time_Readers', 'GamerPCClassification',
       'Time_windows_immersivecontrolpanel', 'Time_chrome_exe',
       'Time_DevTools', 'AppCategoryNMinus2',
       'Time_microsoft_windows_cortana', 'HasFrontFacingCamera',
       'IsCommercial', 'Time_Content', 'Time_Games', 'creativeName',
       'UniqueUserDeviceKey', 'AppCategoryNMinus5',
       'LifeTimeConversionCount', 'DeviceTimezone', 'IsMalware',
       'Time_Browser', 'Time_Social', 'HasRearFacingCamera',
       'TotalEngagementTimeInSec', 'RecentMonthConversionCount',
       'DefaultInternetBrowser', 'AppCategoryNMinus1',
       'AppCategoryNMinus6', 'AppCategoryNMinus4', 'FirstUpdatedDate',
       'PrimaryDiskTotalCapacity', 'TotalPhysicalRAM', 'Time_Games_Core',
       'HasClicked', 'BubbleShownTime', 'CampaignId', 'RowNumber',
       'Time_Media', 'AppCategoryNMinus3', 'IsVirtualDevice'], dtype=object)

In [34]:
# Read tsv file
test2 = pd.read_csv('CoinMlCompetitionSoftlandingEvaluateNoLabel.tsv', sep='\t', header = None)

# Add header because test does not header
df_header = pd.read_csv('test_header.csv')
test_header2 = df_header['0'].values

test2.columns = test_header2

# Reduce test size by leaving train features only
df_initial_features = pd.read_csv('initial_features.csv')
initial_features2 = df_initial_features['0'].values

test2 = test2[list(set(initial_features2) - set(['HasClicked']))]

# Feature engineering - should not delete odd date rows
#test = feature_engineering(test, isDeleteOddDateRows=False)
test2 = feature_engineering(test2)

#random.seed(2007)
bst2 = lgb.Booster(model_file="model.txt") 

# Predict test
df_final_features = pd.read_csv('final_features.csv')
final_features2 = df_final_features['0'].values

preds = bst2.predict(test2[final_features2], num_iteration=457)

# Best threshold from train
#val_best_threshold = 0.072
val_best_threshold = 0.075

# Create submissin file
test_id = test2.RowNumber.values
submission = pd.DataFrame({'RowNumber': test_id})
submission['HasClicked'] = preds > val_best_threshold
print("Click mean:", submission.HasClicked.mean())
print("Submission file...")
submission.to_csv("W10_Coin_test_prediction_0403.csv", index = False)
submission.head()


C:\Users\ryutek\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:2717: DtypeWarning: Columns (2,17,33) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
shape: (374137, 71)
Add datetime features...
BubbleShownTime
FirstUpdatedDate
OSOOBEDateTime
shape: (374137, 83)
Missing value count...
Gamer segment parsing...
Convert categorical columns to numeric...
CampaignId
BubbleShownTime
AppCategoryNMinus2
ISOCountryShortName
IsAlwaysOnAlwaysConnectedCapable
AppCategoryNMinus4
AppCategoryNMinus5
GamerPCClassification
GamerSegment
IsDomainJoined
AppCategoryNMinus6
DeviceTimezone
DisplayLanguage
AppCategoryNMinus3
AppCategoryNMinus7
DefaultInternetBrowser
AppCategoryNMinus8
AppCategoryNMinus1
creativeName
GamerSegment1
creativeNameCotainsSL
creativeNameCotainsTS
creativeNameCotainsWeek7
creativeNameCotainsWeek 7
creativeNameCotainsMeet
creativeNameCotainsSkype
creativeNameCotainsBattery
creativeNameCotainsSwitch
creativeNameCotainsPerformance
creativeNameCotainsSecurity
creativeNameCotainsSurge
Replace missing values with -1
Value count...
UniqueUserDeviceKey
CampaignId
---------------------------------------------------------------------------
LightGBMError                             Traceback (most recent call last)
<ipython-input-34-16b19be74354> in <module>()
     19 
     20 #random.seed(2007)
---> 21 bst2 = lgb.Booster(model_file="model.txt")
     22 
     23 # Predict test

C:\Users\ryutek\Anaconda3\lib\site-packages\lightgbm-0.1-py3.6.egg\lightgbm\basic.py in __init__(self, params, train_set, model_file, silent)
   1221                 c_str(model_file),
   1222                 ctypes.byref(out_num_iterations),
-> 1223                 ctypes.byref(self.handle)))
   1224             out_num_class = ctypes.c_int(0)
   1225             _safe_call(_LIB.LGBM_BoosterGetNumClasses(

C:\Users\ryutek\Anaconda3\lib\site-packages\lightgbm-0.1-py3.6.egg\lightgbm\basic.py in _safe_call(ret)
     45     """
     46     if ret != 0:
---> 47         raise LightGBMError(_LIB.LGBM_GetLastError())
     48 
     49 

LightGBMError: b'Wrong size of feature_names'

In [120]:
# Read trained model
#random.seed(2007)
bst3 = lgb.Booster(model_file="model.txt") 

# Predict test
preds = bst3.predict(test2[features], num_iteration=bst.best_iteration)

# Best threshold from train
#val_best_threshold = 0.072
val_best_threshold = 0.075

# Create submissin file
test_id = test2.RowNumber.values
submission = pd.DataFrame({'RowNumber': test_id})
submission['HasClicked'] = preds > val_best_threshold
print("Click mean:", submission.HasClicked.mean())
print("Submission file...")
submission.to_csv("W10_Coin_test_prediction_0403.csv", index = False)
print(submission.head())
print(preds)


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-120-cde729c4b042> in <module>()
      4 
      5 # Predict test
----> 6 preds = bst3.predict(test2[features], num_iteration=bst.best_iteration)
      7 
      8 # Best threshold from train

C:\Users\hyunor\AppData\Local\Continuum\Anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
   2051         if isinstance(key, (Series, np.ndarray, Index, list)):
   2052             # either boolean or fancy integer index
-> 2053             return self._getitem_array(key)
   2054         elif isinstance(key, DataFrame):
   2055             return self._getitem_frame(key)

C:\Users\hyunor\AppData\Local\Continuum\Anaconda3\lib\site-packages\pandas\core\frame.py in _getitem_array(self, key)
   2095             return self.take(indexer, axis=0, convert=False)
   2096         else:
-> 2097             indexer = self.ix._convert_to_indexer(key, axis=1)
   2098             return self.take(indexer, axis=1, convert=True)
   2099 

C:\Users\hyunor\AppData\Local\Continuum\Anaconda3\lib\site-packages\pandas\core\indexing.py in _convert_to_indexer(self, obj, axis, is_setter)
   1228                 mask = check == -1
   1229                 if mask.any():
-> 1230                     raise KeyError('%s not in index' % objarr[mask])
   1231 
   1232                 return _values_from_object(indexer)

KeyError: "['HasClicked'] not in index"

In [ ]:
# load model with pickle to predict
with open('model.pkl', 'rb') as fin:
    pkl_bst = pickle.load(fin)
    
# Predict test
preds = pkl_bst.predict(test[features], num_iteration=bst.best_iteration)

# Best threshold from train
#val_best_threshold = 0.072
val_best_threshold = 0.075

# Create submissin file
test_id = test.RowNumber.values
submission = pd.DataFrame({'RowNumber': test_id})
submission['HasClicked'] = preds > val_best_threshold
print("Click mean:", submission.HasClicked.mean())
print("Submission file...")
submission.to_csv("W10_Coin_test_prediction_0403.csv", index = False)
submission.head()

In [ ]:


In [48]:
preds


Out[48]:
array([ 0.12908386,  0.0765627 ,  0.10802078, ...,  0.10601138,
        0.10591388,  0.09853806], dtype=float32)

In [23]:
features


Out[23]:
['IsPenCapable',
 'Time_Games_Core',
 'creativeNameCotainsPerformance',
 'AppCategoryNMinus8',
 'Time_PersonalProductivity',
 'BubbleShownTimeHour',
 'Time_StudentAndLearning',
 'BubbleShownTimeYear',
 'Time_Media',
 'AppCategoryNMinus7',
 'IsVirtualDevice',
 'CampaignIdValueCount',
 'CntNs',
 'AppCatCntNs',
 'TotalDays',
 'LifeTimeConversionCount',
 'InternalPrimaryDiagonalDisplaySizeInInches',
 'Time_Communications',
 'Time_Search',
 'creativeNameCotainsSwitch',
 'NumberofExternalDisplays',
 'Time_microsoft_windows_cortana',
 'creativeNameCotainsWeek7',
 'AppCategoryNMinus6',
 'UniqueUserDeviceKeyValueCount',
 'IsAlwaysOnAlwaysConnectedCapable',
 'TotalPhysicalRAM',
 'GamerSegment',
 'UniqueUserDeviceKey',
 'PrimaryDiskTotalCapacity',
 'Time_chrome_exe',
 'Time_Accessibility',
 'creativeNameCotainsSecurity',
 'Time_msascui_exe',
 'Time_Social',
 'AppCategoryNMinus5',
 'Time_Content',
 'FirstUpdatedDateDay',
 'IsCommercial',
 'AppCategoryNMinus3',
 'BubbleShownTimeDay',
 'OSOOBEDateTimeMonth',
 'creativeNameCotainsSkype',
 'FirstUpdatedDateMonth',
 'ISOCountryShortName',
 'creativeNameCotainsMeet',
 'ProcessorPhysicalCores',
 'AppCategoryNMinus2',
 'OSOOBEDateTimeDay',
 'AppCategoryNMinus4',
 'IsDomainJoined',
 'Time_windows_immersivecontrolpanel',
 'ProcessorClockSpeed',
 'creativeNameCotainsSurge',
 'CampaignId',
 'InternalPrimaryDisplayResolutionHorizontal',
 'DisplayLanguage',
 'Time_Malware',
 'creativeNameCotainsSL',
 'OSOOBEDateTimeHour',
 'ProcessorCores',
 'Time_Games',
 'Time_lockapphost_exe',
 'DeviceTimezone',
 'IsMalware',
 'GamerSegment1',
 'BubbleShownTimeMonth',
 'Time_Games_Casual',
 'IsEducation',
 'HasRearFacingCamera',
 'creativeNameCotainsWeek 7',
 'InternalBatteryNumberOfCharges',
 'InternalPrimaryDisplayLogicalDPIX',
 'GamerPCClassification',
 'OSOOBEDateTimeYear',
 'Time_Browser',
 'creativeNameCotainsTS',
 'Time_DevTools',
 'creativeNameCotainsBattery',
 'IsTouchEnabled',
 'DefaultInternetBrowser',
 'FirstUpdatedDateYear',
 'InternalPrimaryDisplayResolutionVertical',
 'NumberofInternalDisplays',
 'InternalPrimaryDisplayLogicalDPIY',
 'creativeName',
 'FirstUpdatedDateHour',
 'TotalEngagementTimeInSec',
 'RecentMonthConversionCount',
 'LifeTimeImpressionCount',
 'Time_Readers',
 'HasFrontFacingCamera',
 'FrontFacingCameraResolution',
 'Time_ModernApps',
 'AppCategoryNMinus1']

In [38]:
txt = 'abc f'
hash(txt)


Out[38]:
-6139562895366302631

In [39]:
hash(txt)


Out[39]:
-6139562895366302631

In [40]:
random.seed(2000)
hash(txt)


Out[40]:
-6139562895366302631

In [65]:
import hashlib
txt = b'abc f'
def sha256_hash_as_int(s):
    return int(hashlib.sha256(str(s).encode('utf-8')).hexdigest(), 16) % (10 ** 16)
sha256_hash_as_int(txt)


Out[65]:
9655835737071027

In [53]:
import hashlib
txt = b'abc f'
def sha256_hash_as_int(s):
    return int(hashlib.sha256(s).digest(), 4)
sha256_hash_as_int(txt)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-53-d41423e0a2d7> in <module>()
      3 def sha256_hash_as_int(s):
      4     return int(hashlib.sha256(s).digest(), 4)
----> 5 sha256_hash_as_int(txt)

<ipython-input-53-d41423e0a2d7> in sha256_hash_as_int(s)
      2 txt = b'abc f'
      3 def sha256_hash_as_int(s):
----> 4     return int(hashlib.sha256(s).digest(), 4)
      5 sha256_hash_as_int(txt)

ValueError: invalid literal for int() with base 4: b'\xa9`\xb4\xf3\xeb\xc1*\x08\xe4FrS\n\xd9\x80=\xa8y\x17\x1eb-\x06\xd8\x1a\xd2\xab\xe8M)\xee\x83'

In [62]:
int(hashlib.sha1(txt).hexdigest(), 16) % (10 ** 16)


Out[62]:
2651114539135181

In [63]:
import hashlib
txt = b'abc f'
int(hashlib.sha1(txt).hexdigest(), 16) % (10 ** 16)


Out[63]:
2651114539135181

In [ ]: